library("tidyverse")
## Warning: пакет 'tidyverse' был собран под R версии 4.2.3
## Warning: пакет 'ggplot2' был собран под R версии 4.2.3
## Warning: пакет 'tibble' был собран под R версии 4.2.3
## Warning: пакет 'tidyr' был собран под R версии 4.2.3
## Warning: пакет 'readr' был собран под R версии 4.2.3
## Warning: пакет 'purrr' был собран под R версии 4.2.3
## Warning: пакет 'dplyr' был собран под R версии 4.2.3
## Warning: пакет 'stringr' был собран под R версии 4.2.3
## Warning: пакет 'forcats' был собран под R версии 4.2.3
## Warning: пакет 'lubridate' был собран под R версии 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the movie table and parse the release dates into Date values.
movieset <- transform(
  read.csv("Top_rated_movies1.csv"),
  release_date = as.Date(release_date)
)
Изучаем датасет:
# Paged preview of the loaded table.
movieset |>
  rmarkdown::paged_table()

# Count the movies sharing each exact release date.
MSDate <- summarise(group_by(movieset, release_date), movie_number = n())

# Scatter plot of the per-date counts over time.
ggplot(data = MSDate, mapping = aes(x = release_date, y = movie_number)) +
  geom_point() +
  theme_bw()
Чем-то похоже на параболу.
# Five-number summary (plus mean) of the release dates.
summary(movieset[["release_date"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## "1902-04-17" "1998-01-30" "2009-08-03" "2004-08-20" "2016-08-02" "2023-11-23"
#' Map release years to the coarse period labels used for grouping.
#'
#' Years up to and including 1950 share the label "1902-1950". Years
#' 1951-1999 fall into 25-year bins and years 2000-2019 into 5-year bins,
#' each labelled "start-end". Years from 2020 on are labelled with their
#' 5-year bin start and the fixed end "2023".
#'
#' @param year Numeric vector of calendar years; a single year works too.
#' @return Character vector of labels with the same length as `year`.
#'   Missing years give `NA_character_`; empty input gives `character(0)`.
range_year <- function(year) {
  labels <- rep(NA_character_, length(year))
  known <- !is.na(year)

  # The four periods are mutually exclusive, mirroring the original cascade
  # of early returns.
  early <- known & year <= 1950
  quarter_century <- known & year > 1950 & year < 2000
  five_year <- known & year >= 2000 & year < 2020
  latest <- known & year >= 2020

  labels[early] <- "1902-1950"

  start_25 <- year[quarter_century] - year[quarter_century] %% 25
  labels[quarter_century] <- paste(start_25, start_25 + 25, sep = "-")

  start_5 <- year[five_year] - year[five_year] %% 5
  labels[five_year] <- paste(start_5, start_5 + 5, sep = "-")

  # The closing label always ends in "2023"; a year of 2025 or later would
  # therefore get a start above its end.
  labels[latest] <- paste(year[latest] - year[latest] %% 5, "2023", sep = "-")

  labels
}
# Add a vote-weighted score and the date components of each release.
# The score is summed within each (title, popularity) group; the grouping is
# then dropped so it does not leak into later steps.
MSPop <- movieset |>
  group_by(title, popularity) |>
  mutate(
    score = sum(vote_average * vote_count),
    year = year(release_date),
    month = month(release_date),
    day = day(release_date)
  ) |>
  ungroup()

# Attach the period label of each release year; vapply() guarantees exactly
# one string per row.
MSRange <- MSPop |>
  mutate(year.range = vapply(year, range_year, character(1)))

# Number of movies in each period.
table(MSRange$year.range)
##
## 1902-1950 1950-1975 1975-2000 2000-2005 2005-2010 2010-2015 2015-2020 2020-2023
## 144 479 1837 865 1218 1533 1833 922
# Distribution of release years across all movies.
summary(MSRange[["year"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1902 1998 2009 2004 2016 2023
Производство фильмов по годам
# Number of movies released in each calendar year.
MSYear <- summarise(group_by(MSPop, year), year.number = n())

# Yearly counts on a log10 scale.
ggplot(data = MSYear, mapping = aes(x = year, y = log10(year.number))) +
  geom_line() +
  theme_classic()

# Yearly counts on the raw scale.
ggplot(data = MSYear, mapping = aes(x = year, y = year.number)) +
  geom_line() +
  theme_classic()

# Paged view of the per-year table.
MSYear |>
  rmarkdown::paged_table()

# Distribution of the years that occur in the data.
summary(MSYear[["year"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1902 1943 1970 1969 1996 2023
# Distribution of the number of movies per year.
summary(MSYear[["year.number"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 6.75 24.00 81.77 117.00 391.00
По месяцам:
# Number of movies released in each calendar month.
MSMonth <- summarise(group_by(MSPop, month), month.number = n())

# Bar chart of the monthly counts, labelled with English month names.
barplot(
  height = MSMonth[["month.number"]],
  names.arg = month.name[MSMonth[["month"]]],
  main = "Movies produced from 1902 to 2023\nSelected by month",
  xlab = "Month",
  ylab = "Movies produced"
)

# Distribution of the number of movies per month.
summary(MSMonth[["month.number"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 533.0 645.5 702.5 735.9 801.0 991.0
По дням:
# Number of movies released on each day of the month.
MSDay <- MSPop |>
  group_by(day) |>
  summarize(day.number = n())

# Bar chart of the per-day counts. barplot() draws with base graphics, so no
# ggplot2 theme is added to it.
barplot(
  MSDay$day.number,
  names.arg = MSDay$day,
  xlab = "Day",
  ylab = "Movies produced",
  main = "Movies produced from 1902 to 2023\nSelected by day"
)
# Distribution of the number of movies per day of the month.
summary(MSDay[["day.number"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 136.0 255.0 292.0 284.9 317.0 347.0
Самый продуктивный год/месяц/день:
# Year with the most releases (which.max() takes the first one on ties).
with(MSYear, year[which.max(year.number)])
## [1] 2018
# Month with the most releases, shown by name.
with(MSMonth, month.name[month[which.max(month.number)]])
## [1] "September"
# Day of the month with the most releases.
with(MSDay, day[which.max(day.number)])
## [1] 12
# Distribution of the vote-weighted score.
summary(MSPop[["score"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4 3032 5569 13825 13131 291052
library("GGally")
## Warning: пакет 'GGally' был собран под R версии 4.2.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#' Pairwise plot matrix of the score-related columns plus one extra column.
#'
#' Shows `score`, `vote_average`, `vote_count`, `popularity` and
#' `extra_column`, coloured by `year.range` and drawn half-transparent.
#' Columns appear in the order they have in `data`.
#'
#' @param data Data frame holding the columns above and `year.range`.
#' @param extra_column Name of the additional column to include.
#' @param log_columns Names of the columns replaced by their log10 values
#'   before plotting.
#' @return The plot matrix built by `ggpairs()`.
plot_score_pairs <- function(data, extra_column, log_columns = "popularity") {
  shown <- c("score", "vote_average", "vote_count", "popularity", extra_column)
  data |>
    # across() cannot reach grouping columns, so drop any grouping first.
    ungroup() |>
    mutate(across(all_of(log_columns), log10)) |>
    ggpairs(
      mapping = aes(alpha = 0.5, colour = year.range),
      columns = which(names(data) %in% shown)
    )
}

# Score-related columns against the full release date.
plot_score_pairs(MSRange, "release_date")

# ... against the release year.
plot_score_pairs(MSRange, "year")

# ... against the release month.
plot_score_pairs(MSRange, "month")

# ... against the day of the month, with the vote columns on a log10 scale too.
plot_score_pairs(
  MSRange,
  "day",
  log_columns = c("popularity", "vote_average", "vote_count")
)

# ... against the movie id.
plot_score_pairs(MSRange, "id")
# Ten most popular movies of the earliest period; converting to a plain data
# frame first means the ranking runs over all rows at once.
MSRange |>
  as.data.frame() |>
  filter(year.range == "1902-1950") |>
  slice_max(order_by = popularity, n = 10)
## id title
## 1 11224 Cinderella
## 2 11360 Dumbo
## 3 408 Snow White and the Seven Dwarfs
## 4 630 The Wizard of Oz
## 5 10895 Pinocchio
## 6 3170 Bambi
## 7 1585 It's a Wonderful Life
## 8 770 Gone with the Wind
## 9 11881 Miracle on 34th Street
## 10 15 Citizen Kane
## overview
## 1 Cinderella has faith her dreams of a better life will come true. With help from her loyal mice friends and a wave of her Fairy Godmother's wand, Cinderella's rags are magically turned into a glorious gown and off she goes to the Royal Ball. But when the clock strikes midnight, the spell is broken, leaving a single glass slipper... the only key to the ultimate fairy-tale ending!
## 2 Dumbo is a baby elephant born with over-sized ears and a supreme lack of confidence. But thanks to his even more diminutive buddy Timothy the Mouse, the pint-sized pachyderm learns to surmount all obstacles.
## 3 A beautiful girl, Snow White, takes refuge in the forest in the house of seven dwarfs to hide from her stepmother, the wicked Queen. The Queen is jealous because she wants to be known as "the fairest in the land," and Snow White's beauty surpasses her own.
## 4 Young Dorothy finds herself in a magical world where she makes friends with a lion, a scarecrow and a tin man as they make their way along the yellow brick road to talk with the Wizard and ask for the things they miss most in their lives. The Wicked Witch of the West is the only thing that could stop them.
## 5 When loving Geppetto creates a wooden puppet, his wish is granted when it comes to life as a little wooden boy named Pinocchio. With his faithful friend and conscience Jiminy Cricket by his side, Pinocchio, embarks on fantastic adventures that his bravery, loyalty and honesty until triumphs in his triumphs in his quest for his heart's desire: to become a real boy.
## 6 Bambi's tale unfolds from season to season as the young prince of the forest learns about life, love, and friends.
## 7 A holiday favourite for generations... George Bailey has spent his entire life giving to the people of Bedford Falls. All that prevents rich skinflint Mr. Potter from taking over the entire town is George's modest building and loan company. But on Christmas Eve the business's $8,000 is lost and George's troubles begin.
## 8 The spoiled daughter of a Georgia plantation owner conducts a tumultuous romance with a cynical profiteer during the American Civil War and Reconstruction Era.
## 9 Kris Kringle, seemingly the embodiment of Santa Claus, is asked to portray the jolly old fellow at Macy's following his performance in the Thanksgiving Day parade. His portrayal is so complete that many begin to question if he truly is Santa Claus, while others question his sanity.
## 10 Newspaper magnate, Charles Foster Kane is taken from his mother as a boy and made the ward of a rich industrialist. As a result, every well-meaning, tyrannical or self-destructive move he makes for the rest of his life appears in some way to be a reaction to that deeply wounding event.
## popularity release_date vote_average vote_count score year month day
## 1 87.122 1950-02-22 7.042 6298 44350.516 1950 2 22
## 2 80.097 1941-10-31 6.997 4634 32424.098 1941 10 31
## 3 72.349 1937-12-21 7.122 6912 49227.264 1937 12 21
## 4 70.664 1939-08-15 7.575 5167 39140.025 1939 8 15
## 5 66.270 1940-02-23 7.103 5483 38945.749 1940 2 23
## 6 66.096 1942-08-14 7.009 5284 37035.556 1942 8 14
## 7 63.569 1946-12-20 8.263 3905 32267.015 1946 12 20
## 8 49.256 1939-12-15 7.968 3739 29792.352 1939 12 15
## 9 43.762 1947-06-04 7.336 648 4753.728 1947 6 4
## 10 40.691 1941-04-17 8.016 5102 40897.632 1941 4 17
## year.range
## 1 1902-1950
## 2 1902-1950
## 3 1902-1950
## 4 1902-1950
## 5 1902-1950
## 6 1902-1950
## 7 1902-1950
## 8 1902-1950
## 9 1902-1950
## 10 1902-1950
# Ten least popular movies of the earliest period; converting to a plain data
# frame first means the ranking runs over all rows at once.
MSRange |>
  as.data.frame() |>
  filter(year.range == "1902-1950") |>
  slice_min(order_by = popularity, n = 10)
## id title
## 1 27040 Meshes of the Afternoon
## 2 37719 A Night at the Opera
## 3 10728 Faust
## 4 3059 Intolerance: Love's Struggle Throughout the Ages
## 5 28978 The Circus
## 6 776 The Rules of the Game
## 7 30588 Monsieur Verdoux
## 8 3086 The Lady Eve
## 9 212 Arsenic and Old Lace
## 10 17057 In a Lonely Place
## overview
## 1 A woman returning home falls asleep and has vivid dreams that may or may not be happening in reality. Through repetitive images and complete mismatching of the objective view of time and space, her dark inner desires play out on-screen.
## 2 The Marx Brothers take on high society and the opera world to bring two lovers together. A sly business manager and two wacky friends of two opera singers help them achieve success while humiliating their stuffy and snobbish enemies.
## 3 God and Satan war over earth; to settle things, they wager on the soul of Faust, a learned and prayerful alchemist.
## 4 The story of a poor young woman, separated by prejudice from her husband and baby, is interwoven with tales of intolerance from throughout history.
## 5 Charlie, a wandering tramp, becomes a circus handyman - soon the star of the show - and falls in love with the circus owner's stepdaughter.
## 6 A weekend at a marquis’ country château lays bare some ugly truths about a group of haut bourgeois acquaintances.
## 7 The film is about an unemployed banker, Henri Verdoux, and his sociopathic methods of attaining income. While being both loyal and competent in his work, Verdoux has been laid-off. To make money for his wife and child, he marries wealthy widows and then murders them. His crime spree eventually works against him when two particular widows break his normal routine.
## 8 It's no accident when wealthy Charles falls for Jean. Jean is a con artist with her sights set on Charles' fortune. Matters complicate when Jean starts falling for her mark. When Charles suspects Jean is a gold digger, he dumps her. Jean, fixated on revenge and still pining for the millionaire, devises a plan to get back in Charles' life. With love and payback on her mind, she re-introduces herself to Charles, this time as an aristocrat named Lady Eve Sidwich.
## 9 Mortimer Brewster, a newspaper drama critic, playwright, and author known for his diatribes against marriage, suddenly falls in love and gets married; but when he makes a quick trip home to tell his two maiden aunts, he finds out his aunts' hobby - killing lonely old men and burying them in the cellar!
## 10 An aspiring actress begins to suspect that her temperamental and mentally impaired boyfriend is a murderer.
## popularity release_date vote_average vote_count score year month day
## 1 8.144 1943-01-01 7.680 339 2603.520 1943 1 1
## 2 8.586 1935-11-15 7.438 442 3287.596 1935 11 15
## 3 9.864 1926-10-13 7.911 310 2452.410 1926 10 13
## 4 10.373 1916-09-04 7.103 306 2173.518 1916 9 4
## 5 10.429 1928-01-06 8.005 713 5707.565 1928 1 6
## 6 10.454 1939-07-09 7.592 520 3947.840 1939 7 9
## 7 10.632 1947-09-26 7.782 379 2949.378 1947 9 26
## 8 11.183 1941-02-25 7.217 307 2215.619 1941 2 25
## 9 11.212 1944-09-01 7.625 849 6473.625 1944 9 1
## 10 11.245 1950-05-17 7.548 506 3819.288 1950 5 17
## year.range
## 1 1902-1950
## 2 1902-1950
## 3 1902-1950
## 4 1902-1950
## 5 1902-1950
## 6 1902-1950
## 7 1902-1950
## 8 1902-1950
## 9 1902-1950
## 10 1902-1950
По названиям фильмов: часто встречаются слова вроде “The” и “A”. Интересно, сколько названий начинается с них.
library("stringr")
# Extract the first word of every title.
MSTitle <- mutate(movieset, firstWord = word(title, 1))

# How often each first word occurs, in absolute terms and as a share of all
# titles, most frequent first.
group_by(MSTitle, firstWord) |>
  summarise(total.fw = n(), ratio = n() / nrow(MSTitle)) |>
  arrange(desc(total.fw)) |>
  as.data.frame() |>
  rmarkdown::paged_table()
Такая вот интересная статистика
# All movies whose title starts with the word "American".
MSTitle |>
  as.data.frame() |>
  filter(firstWord == "American") |>
  rmarkdown::paged_table()
А что насчёт самых длинных названий?
# Title length in characters and in space-separated words. nchar() and
# lengths(strsplit()) already work on whole columns, so no per-row apply is
# needed.
MSTitle <- MSTitle |>
  mutate(
    title.length = nchar(title),
    title.word.length = lengths(strsplit(title, " "))
  )

# Number of titles for each word count, most common first.
MSTitle |>
  group_by(title.word.length) |>
  summarise(Number = n()) |>
  as.data.frame() |>
  arrange(desc(Number)) |>
  rmarkdown::paged_table()

# Number of titles for each character count, most common first.
MSTitle |>
  group_by(title.length) |>
  summarise(Number = n()) |>
  as.data.frame() |>
  arrange(desc(Number)) |>
  rmarkdown::paged_table()
# The 50 rarest word counts (ties included).
MSMinWL <- summarise(group_by(MSTitle, title.word.length), Number = n()) |>
  as.data.frame() |>
  slice_min(order_by = Number, n = 50)

# The 50 rarest character counts (ties included).
MSMinL <- summarise(group_by(MSTitle, title.length), Number = n()) |>
  as.data.frame() |>
  slice_min(order_by = Number, n = 50)

# Movies whose word count is among the rare ones, longest first.
MSTitle |>
  filter(title.word.length %in% MSMinWL[["title.word.length"]]) |>
  as.data.frame() |>
  arrange(desc(title.word.length)) |>
  rmarkdown::paged_table()

# Movies whose character count is among the rare ones, longest first.
MSTitle |>
  filter(title.length %in% MSMinL[["title.length"]]) |>
  as.data.frame() |>
  arrange(desc(title.length)) |>
  rmarkdown::paged_table()

# Movies with titles of at most 10 characters, shortest first.
MSTitle |>
  filter(title.length <= 10) |>
  as.data.frame() |>
  arrange(title.length) |>
  rmarkdown::paged_table()